{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:53.128203Z",
     "start_time": "2025-02-19T01:48:52.798660Z"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import re\n",
    "import json\n",
    "from typing import List, Optional, Tuple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:53.141952Z",
     "start_time": "2025-02-19T01:48:53.139769Z"
    }
   },
   "outputs": [],
   "source": [
    "def normalize_trait(trait):\n",
    "    trait = '_'.join(trait.split())\n",
    "    normalized_trait = ''.join(trait.split(\"'\"))\n",
    "    return normalized_trait"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "## Get trait-condition pairs and get gene IoUs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:53.193068Z",
     "start_time": "2025-02-19T01:48:53.185923Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open(\"../metadata/task_info.json\", \"r\") as f:\n",
    "    info = json.load(f)\n",
    "t2g = {k: v['related_genes'] for k, v in info.items()}\n",
    "traits = sorted(list(t2g.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:53.262518Z",
     "start_time": "2025-02-19T01:48:53.256090Z"
    }
   },
   "outputs": [],
   "source": [
    "len(traits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:53.618206Z",
     "start_time": "2025-02-19T01:48:53.614552Z"
    }
   },
   "outputs": [],
   "source": [
    "len(t2g['Breast_Cancer'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:53.655507Z",
     "start_time": "2025-02-19T01:48:53.650605Z"
    }
   },
   "outputs": [],
   "source": [
    "cancer_traits = \\\n",
    "['Acute_Myeloid_Leukemia', 'Adrenocortical_Cancer', 'Bile_Duct_Cancer', 'Bladder_Cancer', 'Breast_Cancer', \n",
    "'Cervical_Cancer', 'Colon_and_Rectal_Cancer', 'Endometrioid_Cancer', 'Esophageal_Cancer', 'Glioblastoma', \n",
    "'Head_and_Neck_Cancer', 'Kidney_Chromophobe', 'Kidney_Clear_Cell_Carcinoma', 'Kidney_Papillary_Cell_Carcinoma', \n",
    "'Large_B-cell_Lymphoma', 'Liver_Cancer', 'Lower_Grade_Glioma', 'Lung_Cancer', 'Melanoma', 'Mesothelioma', \n",
    "'Ocular_Melanomas', 'Ovarian_Cancer', 'Pancreatic_Cancer', 'Pheochromocytoma_and_Paraganglioma', \n",
    "'Prostate_Cancer', 'Rectal_Cancer', 'Retinoblastoma', 'Sarcoma', 'Stomach_Cancer', 'Testicular_Cancer', \n",
    "'Thymoma', 'Thyroid_Cancer', 'Uterine_Carcinosarcoma', 'Uterine_Corpus_Endometrial_Carcinoma', \n",
    "'lower_grade_glioma_and_glioblastoma']\n",
    "len(cancer_traits)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:54.512205Z",
     "start_time": "2025-02-19T01:48:54.181571Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = []\n",
    "for t in traits:\n",
    "    for s in traits + ['Age', 'Gender']:\n",
    "        if t == s: continue\n",
    "        if s in ['Age', 'Gender']: \n",
    "            iou = 1.0\n",
    "        else:\n",
    "            len_union = len(set(t2g[t]).union(set(t2g[s])))\n",
    "            if len_union == 0: continue\n",
    "            iou = len(set(t2g[t]).intersection(set(t2g[s]))) / len_union\n",
    "        data.append({'Trait': t, 'Condition': s, 'IoU': iou})\n",
    "rel = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:55.106088Z",
     "start_time": "2025-02-19T01:48:55.098771Z"
    }
   },
   "outputs": [],
   "source": [
    "rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:56.125093Z",
     "start_time": "2025-02-19T01:48:56.121628Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 回头修订一下。\n",
    "\n",
    "condition_only = ['Vitamin_D_Levels', 'LDL_Cholesterol_Levels']\n",
    "male_traits = ['Prostate_Cancer', 'Testicular_Cancer']\n",
    "female_traits = ['Cervical_Cancer', 'Endometriosis', 'Endometrioid_Cancer', 'Uterine_Carcinosarcoma', 'Uterine_Corpus_Endometrial_Carcinoma', 'Ovarian_Cancer', 'Polycystic_Ovary_Syndrome'] \n",
    "gender_traits = male_traits + female_traits\n",
    "condition_all = ['Obesity', 'Hypertension']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:57.039089Z",
     "start_time": "2025-02-19T01:48:57.035670Z"
    }
   },
   "outputs": [],
   "source": [
    "assert all(t in traits for t in condition_only)\n",
    "assert all(t in traits for t in gender_traits)\n",
    "assert all(t in traits for t in condition_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:57.791984Z",
     "start_time": "2025-02-19T01:48:57.780053Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rel = rel[~((rel['Trait'].isin(gender_traits)) & (rel['Condition'] == 'Gender'))]\n",
    "rel = rel[~rel['Trait'].isin(condition_only)]\n",
    "rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:58.533728Z",
     "start_time": "2025-02-19T01:48:58.521997Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rel = rel[~(rel['Trait'].isin(male_traits) & rel['Condition'].isin(female_traits))]\n",
    "rel = rel[~(rel['Trait'].isin(female_traits) & rel['Condition'].isin(male_traits))]\n",
    "rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:59.054717Z",
     "start_time": "2025-02-19T01:48:59.044355Z"
    }
   },
   "outputs": [],
   "source": [
    "rel = rel[~((rel['Trait'].isin(cancer_traits)) & (rel['Condition'].isin(cancer_traits)))]\n",
    "rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:48:59.835170Z",
     "start_time": "2025-02-19T01:48:59.819445Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rel = rel.sort_values(by='IoU', ascending=False).reset_index().drop(columns=['index'])\n",
    "rel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T01:49:18.443315Z",
     "start_time": "2025-02-19T01:49:18.433248Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "selected = rel[(rel['IoU'] >= 0.20) | rel['Condition'].isin(condition_all)]\n",
    "selected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T03:58:12.129625Z",
     "start_time": "2025-02-19T03:58:12.064547Z"
    }
   },
   "outputs": [],
   "source": [
    "# Create a new dictionary to store both genes and conditions for each trait\n",
    "new_task_info = {}\n",
    "\n",
    "# Iterate through each trait in t2g\n",
    "for trait in t2g:\n",
    "    # Get the conditions for this trait from the selected DataFrame\n",
    "    trait_conditions = selected.loc[selected['Trait'] == trait, 'Condition'].unique().tolist()\n",
    "    \n",
    "    # Create the new dictionary structure for this trait\n",
    "    new_task_info[trait] = {\n",
    "        'related_genes': t2g[trait],  # Original list of genes\n",
    "        'conditions': trait_conditions       # List of conditions\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(new_task_info['Acute_Myeloid_Leukemia']['related_genes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T05:03:13.239507Z",
     "start_time": "2025-02-19T05:03:13.225479Z"
    }
   },
   "outputs": [],
   "source": [
    "new_task_info['Height']['related_genes']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-19T04:52:18.694656Z",
     "start_time": "2025-02-19T04:52:18.691006Z"
    }
   },
   "outputs": [],
   "source": [
    "for k in new_task_info:\n",
    "    print(k, len(new_task_info[k]['related_genes']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"../metadata/task_info.json\", \"w\") as f:\n",
    "    json.dump(new_task_info, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"../metadata/task_info.json\", \"r\") as f:\n",
    "    new_task_info2 = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_task_info == new_task_info2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Refine trait-condition pairs\n",
    "Remove those that don't have common gene regressors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'seaborn'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m      2\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstatistics\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_question_pairs\n",
      "File \u001b[0;32m~/Desktop/model/sparse-lmm/utils.py:11\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtempfile\u001b[39;00m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mshutil\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mseaborn\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msns\u001b[39;00m\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mmatplotlib\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpyplot\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mplt\u001b[39;00m\n\u001b[1;32m     13\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'seaborn'"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "sys.path.append('..')\n",
    "from tools.statistics import *\n",
    "from utils.utils import get_question_pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_root = \"../output/preprocess/1\"\n",
    "task_info_path = \"../metadata/task_info.json\"\n",
    "with open(task_info_path, \"r\") as f:\n",
    "    task_info = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'Hypertension' in task_info['Hypertension']['conditions']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 331 candidate genes that can be used in two-step regression analysis, such as ['AGTR1', 'ADRB1', 'ACE', 'EDNRA', 'ADRA1A', 'GUCY1B1', 'ADRA1B', 'NR3C2', 'REN', 'CACNA1C'].\n",
      "Found 45 candidate genes that can be used in two-step regression analysis, such as ['THPO', 'MPL', 'JAK2', 'PDE3A', 'PDE3B', 'CALR', 'RRM2', 'SH2B3', 'TET2', 'JAK1'].\n",
      "Found 330 candidate genes that can be used in two-step regression analysis, such as ['AGTR1', 'ADRB1', 'ACE', 'EDNRA', 'ADRA1A', 'GUCY1B1', 'ADRA1B', 'NR3C2', 'REN', 'CACNA1C'].\n",
      "Found 329 candidate genes that can be used in two-step regression analysis, such as ['AGTR1', 'ADRB1', 'ACE', 'EDNRA', 'ADRA1A', 'GUCY1B1', 'ADRA1B', 'NR3C2', 'REN', 'CACNA1C'].\n",
      "No available data, best cohorts being 'None' for the trait 'Alcohol_Flush_Reaction' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Alcohol_Flush_Reaction and Hypertension\n",
      "1\n",
      "Found 16 candidate genes that can be used in two-step regression analysis, such as ['ACE', 'GUCY1B1', 'GUCY1A1', 'NDUFS8', 'RUNX3', 'PIK3CG', 'TNF', 'TGFB2', 'MME', 'VIM'].\n",
      "Found 324 candidate genes that can be used in two-step regression analysis, such as ['AGTR1', 'ADRB1', 'ACE', 'EDNRA', 'ADRA1A', 'GUCY1B1', 'ADRA1B', 'NR3C2', 'REN', 'CACNA1C'].\n",
      "Found 297 candidate genes that can be used in two-step regression analysis, such as ['AGTR1', 'ACE', 'EDNRA', 'ADRA1A', 'GUCY1B1', 'ADRA1B', 'NR3C2', 'REN', 'CACNA1C', 'ADRB2'].\n",
      "No available data, best cohorts being 'None' for the trait 'Amyotrophic_Lateral_Sclerosis' and 'GSE67311' for the condition 'Fibromyalgia'\n",
      "NO REGRESSORS FOR Amyotrophic_Lateral_Sclerosis and Fibromyalgia\n",
      "2\n",
      "No available data, best cohorts being 'None' for the trait 'Amyotrophic_Lateral_Sclerosis' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Amyotrophic_Lateral_Sclerosis and Hypertension\n",
      "3\n",
      "No available data, best cohorts being 'None' for the trait 'Angelman_Syndrome' and 'None' for the condition 'Anxiety_disorder'\n",
      "NO REGRESSORS FOR Angelman_Syndrome and Anxiety_disorder\n",
      "4\n",
      "No available data, best cohorts being 'None' for the trait 'Angelman_Syndrome' and 'GSE61672' for the condition 'Generalized_Anxiety_Disorder'\n",
      "NO REGRESSORS FOR Angelman_Syndrome and Generalized_Anxiety_Disorder\n",
      "5\n",
      "No available data, best cohorts being 'None' for the trait 'Angelman_Syndrome' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Angelman_Syndrome and Hypertension\n",
      "6\n",
      "No available data, best cohorts being 'None' for the trait 'Aniridia' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Aniridia and Hypertension\n",
      "7\n",
      "No available data, best cohorts being 'None' for the trait 'Ankylosing_Spondylitis' and 'GSE66407' for the condition 'Crohns_Disease'\n",
      "NO REGRESSORS FOR Ankylosing_Spondylitis and Crohns_Disease\n",
      "8\n",
      "No available data, best cohorts being 'None' for the trait 'Ankylosing_Spondylitis' and 'GSE72625' for the condition 'Celiac_Disease'\n",
      "NO REGRESSORS FOR Ankylosing_Spondylitis and Celiac_Disease\n",
      "9\n",
      "No available data, best cohorts being 'None' for the trait 'Ankylosing_Spondylitis' and 'None' for the condition 'Asthma'\n",
      "NO REGRESSORS FOR Ankylosing_Spondylitis and Asthma\n",
      "10\n",
      "No available data, best cohorts being 'None' for the trait 'Ankylosing_Spondylitis' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Ankylosing_Spondylitis and Hypertension\n",
      "11\n",
      "No available data, best cohorts being 'None' for the trait 'Anorexia_Nervosa' and 'GSE99725' for the condition 'Depression'\n",
      "NO REGRESSORS FOR Anorexia_Nervosa and Depression\n",
      "12\n",
      "No available data, best cohorts being 'None' for the trait 'Anorexia_Nervosa' and 'GSE92538' for the condition 'Bipolar_disorder'\n",
      "NO REGRESSORS FOR Anorexia_Nervosa and Bipolar_disorder\n",
      "13\n",
      "No available data, best cohorts being 'None' for the trait 'Anorexia_Nervosa' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Anorexia_Nervosa and Hypertension\n",
      "14\n",
      "No available data, best cohorts being 'None' for the trait 'Anxiety_disorder' and 'None' for the condition 'Angelman_Syndrome'\n",
      "NO REGRESSORS FOR Anxiety_disorder and Angelman_Syndrome\n",
      "15\n",
      "No available data, best cohorts being 'None' for the trait 'Anxiety_disorder' and 'GSE61672' for the condition 'Generalized_Anxiety_Disorder'\n",
      "NO REGRESSORS FOR Anxiety_disorder and Generalized_Anxiety_Disorder\n",
      "16\n",
      "No available data, best cohorts being 'None' for the trait 'Anxiety_disorder' and 'GSE208668' for the condition 'Insomnia'\n",
      "NO REGRESSORS FOR Anxiety_disorder and Insomnia\n",
      "17\n",
      "No available data, best cohorts being 'None' for the trait 'Anxiety_disorder' and 'GSE67311' for the condition 'Fibromyalgia'\n",
      "NO REGRESSORS FOR Anxiety_disorder and Fibromyalgia\n",
      "18\n",
      "No available data, best cohorts being 'None' for the trait 'Anxiety_disorder' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Anxiety_disorder and Hypertension\n",
      "19\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'GSE235307' for the condition 'Atrial_Fibrillation'\n",
      "NO REGRESSORS FOR Arrhythmia and Atrial_Fibrillation\n",
      "20\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'GSE182600' for the condition 'Congestive_heart_failure'\n",
      "NO REGRESSORS FOR Arrhythmia and Congestive_heart_failure\n",
      "21\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'TCGA' for the condition 'Head_and_Neck_Cancer'\n",
      "NO REGRESSORS FOR Arrhythmia and Head_and_Neck_Cancer\n",
      "22\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'GSE162635' for the condition 'Chronic_obstructive_pulmonary_disease_(COPD)'\n",
      "NO REGRESSORS FOR Arrhythmia and Chronic_obstructive_pulmonary_disease_(COPD)\n",
      "23\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'None' for the condition 'Cardiovascular_Disease'\n",
      "NO REGRESSORS FOR Arrhythmia and Cardiovascular_Disease\n",
      "24\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'GSE67311' for the condition 'Fibromyalgia'\n",
      "NO REGRESSORS FOR Arrhythmia and Fibromyalgia\n",
      "25\n",
      "No available data, best cohorts being 'None' for the trait 'Arrhythmia' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Arrhythmia and Hypertension\n",
      "26\n",
      "No available data, best cohorts being 'None' for the trait 'Asthma' and 'GSE182740' for the condition 'Eczema'\n",
      "NO REGRESSORS FOR Asthma and Eczema\n",
      "27\n",
      "No available data, best cohorts being 'None' for the trait 'Asthma' and 'GSE66407' for the condition 'Crohns_Disease'\n",
      "NO REGRESSORS FOR Asthma and Crohns_Disease\n",
      "28\n",
      "No available data, best cohorts being 'None' for the trait 'Asthma' and 'None' for the condition 'Ankylosing_Spondylitis'\n",
      "NO REGRESSORS FOR Asthma and Ankylosing_Spondylitis\n",
      "29\n",
      "No available data, best cohorts being 'None' for the trait 'Asthma' and 'GSE162635' for the condition 'Chronic_obstructive_pulmonary_disease_(COPD)'\n",
      "NO REGRESSORS FOR Asthma and Chronic_obstructive_pulmonary_disease_(COPD)\n",
      "30\n",
      "No available data, best cohorts being 'None' for the trait 'Asthma' and 'GSE128381' for the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Asthma and Hypertension\n",
      "31\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Huntingtons_Disease'\n",
      "NO REGRESSORS FOR Atherosclerosis and Huntingtons_Disease\n",
      "32\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Chronic_kidney_disease'\n",
      "NO REGRESSORS FOR Atherosclerosis and Chronic_kidney_disease\n",
      "33\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Chronic_obstructive_pulmonary_disease_(COPD)'\n",
      "NO REGRESSORS FOR Atherosclerosis and Chronic_obstructive_pulmonary_disease_(COPD)\n",
      "34\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Atrial_Fibrillation'\n",
      "NO REGRESSORS FOR Atherosclerosis and Atrial_Fibrillation\n",
      "35\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Epilepsy'\n",
      "NO REGRESSORS FOR Atherosclerosis and Epilepsy\n",
      "36\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Breast_Cancer'\n",
      "NO REGRESSORS FOR Atherosclerosis and Breast_Cancer\n",
      "37\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Coronary_artery_disease'\n",
      "NO REGRESSORS FOR Atherosclerosis and Coronary_artery_disease\n",
      "38\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Congestive_heart_failure'\n",
      "NO REGRESSORS FOR Atherosclerosis and Congestive_heart_failure\n",
      "39\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Hypertension'\n",
      "NO REGRESSORS FOR Atherosclerosis and Hypertension\n",
      "40\n",
      "No available cohorts with common regressors for the trait 'Atherosclerosis' and the condition 'Bipolar_disorder'\n",
      "NO REGRESSORS FOR Atherosclerosis and Bipolar_disorder\n",
      "41\n"
     ]
    }
   ],
   "source": [
    "data_root = \"../output/preprocess/1\"\n",
    "task_info_path = \"../metadata/task_info.json\"\n",
    "with open(task_info_path, \"r\") as f:\n",
    "    task_info = json.load(f)\n",
    "\n",
    "count = 0\n",
    "all_traits = sorted(list(task_info.keys()))\n",
    "for trait in all_traits:\n",
    "    for condition in task_info[trait]['conditions']:\n",
    "        if condition in ['Age', 'Gender', None]: continue\n",
    "        try:\n",
    "            trait_data, condition_data, regressor = select_and_load_cohort(data_root, trait, condition, True, task_info_path)\n",
    "            if regressor is None:\n",
    "                print(f\"NO REGRESSORS FOR {trait} and {condition}\" )\n",
    "                count += 1\n",
    "                print(count)\n",
    "        except:\n",
    "            continue\n",
    "print(count)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(file_path, 'r') as f:\n",
    "    task_info = json.load(f)\n",
    "        "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "agent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
